# Import libraries
library(ggplot2)
library(knitr)
library(kableExtra)

# library(foreach)
# library(doParallel)
# library(tictoc)

library(htmltools)
library(limma) # Bioconductor
library(DESeq2) # Bioconductor
library(dplyr)  # R package
library(latex2exp)
library(RColorBrewer)
library(gplots)
library(calibrate)
library(ggrepel)

baseWD <- "./" #paste0(getwd(),"/")

MultiQC

FastQC

Trimming

Trimming (FastQC)

Alignment

DESeq2 Analysis

1. Reading data

First we read in counts data inside the counts.matrix file, the experimental design (exp_design.csv), and the gene annotation table (gene_annotation.txt).

exp_design <- read.table(paste0(baseWD,"exp_design.csv"), header=T, sep=",")
countData <- read.table(paste0(baseWD,"counts.matrix"), header=T, sep="\t", row.names=1)

# get rid of rows with zero values
countData <- countData[apply(countData, 1, function(countData) !all(countData==0)),]

# Load annotation table
dbs <- read.table(paste0(baseWD,"gene_annotation.txt"), header=FALSE, sep="\t")

dbs <- dbs[,c(9, 11, 13)]
dbs <- sapply(dbs, function(x){gsub("gene_id|gene_name|gene_biotype| ", "", x)})
colnames(dbs) <- c("Geneid", "Gene", "Class")
dbs <- data.frame(dbs)
Top 5 genes read counts
SRR1958165 SRR1958166 SRR1958167 SRR1958168 SRR1958169 SRR1958170
ENSG00000227232 2 1 3 7 10 5
ENSG00000238009 1 0 0 0 0 1
ENSG00000233750 2 0 2 3 1 1
ENSG00000268903 36 43 30 33 29 37
ENSG00000269981 39 28 38 52 43 19
Experimental design
samples labels condition
SRR1958165 control1 control
SRR1958166 control2 control
SRR1958167 FOXA2KD1 FOXA2
SRR1958168 FOXA2KD2 FOXA2
SRR1958169 DEANR1KD1 DEANR1
SRR1958170 DEANR1KD2 DEANR1
Top 5 genes in the annotation table
Geneid Gene Class
ENSG00000223972 DDX11L1 transcribed_unprocessed_pseudogene
ENSG00000227232 WASH7P unprocessed_pseudogene
ENSG00000278267 MIR6859-1 miRNA
ENSG00000243485 MIR1302-2HG lncRNA
ENSG00000284332 MIR1302-2 miRNA

2. Simple exploration of the raw data

Summary for read counts
Min. 1st Qu. Median Mean 3rd Qu. Max.
SRR1958165 0 1 8 548.4 317.0 130506
SRR1958166 0 1 8 573.6 322.0 199856
SRR1958167 0 1 7 595.7 298.0 146246
SRR1958168 0 1 7 556.1 293.0 182955
SRR1958169 0 1 7 583.6 300.0 150419
SRR1958170 0 0 7 574.0 288.5 261800
barplot(colSums(countData)/1e6, col="#66BD63",las=1,main="Total read counts (millions)", ylab="Total reads in millions")

transformation <- paste0("Log_2(",colnames(countData)[1],")")
hist(log2(countData[,1]),br=200, xlab = TeX(transformation), col="#08519C",
     main = TeX(paste0("Histogram of ",transformation)))

Simple log transformation on unnormalized data

logCountData = log2(1 + countData)
par(mfrow = c(1, 2), mar = c(8,2,2,2))  # two columns
hist(logCountData[,1], xlab = TeX(transformation),
     main = TeX(paste0("Histogram of ",transformation)))
boxplot(logCountData,las=3)

x <- logCountData
myColors <- brewer.pal(dim(x)[2],"Spectral")#rainbow(dim(x)[2])
plot(density(x[,1]),col = myColors[1], lwd=2,
  xlab="Expression values", ylab="Density", main= "Distribution of transformed data",
  ylim=c(0, max(density(x[,1])$y)+.02 ) )

for(i in 2:dim(x)[2] )
lines(density(x[,i]),col=myColors[i], lwd=2 )
  legend("topright", cex=1.1,colnames(x), lty=rep(1,dim(x)[2]), col=myColors)

xlab <- paste0("Log_2(",colnames(countData)[1],")")
ylab <- paste0("Log_2(",colnames(countData)[2],")")
plot(logCountData[,1],logCountData[,2],
     xlab = TeX(xlab),
     ylab = TeX(ylab),
     main = TeX(paste0(ylab, " vs ",xlab)))

3. Filtering, normalization, and transformation using DESeq2

Using the condition column from the experimental design

condition <- exp_design$condition
countdata <- as.matrix(countData)
coldata <- data.frame(row.names=colnames(countdata), condition)
dds <- DESeqDataSetFromMatrix(countData=countdata, colData=coldata, design=~condition)

# Run the DESeq pipeline
dds <- DESeq(dds)

# Plot dispersions
plotDispEsts(dds, main="")

dds <- dds[ rowSums(counts(dds)) > 5, ]

rlog transformation

Regularized log transformation

# Regularized log transformation for clustering/heatmaps, etc
rld <- rlog(dds, blind = FALSE)
kable(head(assay(rld), 5), booktabs = TRUE) %>%
  kable_styling(latex_options = c("centered","striped","HOLD_position","scale_down"))
SRR1958165 SRR1958166 SRR1958167 SRR1958168 SRR1958169 SRR1958170
ENSG00000227232 2.1387750 2.1182630 2.1570646 2.2370556 2.2845222 2.2005411
ENSG00000238009 -1.4859428 -1.5003799 -1.5003351 -1.4998930 -1.5001465 -1.4855680
ENSG00000233750 0.5784663 0.5392201 0.5778457 0.5997311 0.5593197 0.5602591
ENSG00000268903 5.1091409 5.1839010 5.0213992 5.0898451 5.0185675 5.1467081
ENSG00000269981 5.1816560 5.0226782 5.1607847 5.3581017 5.2356108 4.9106334

VSD transformation

Variance-stabilizing transformation

vsd <- vst(dds, blind = FALSE)

kable(head(assay(vsd), 5), booktabs = TRUE) %>%
  kable_styling(latex_options = c("centered","striped","HOLD_position","scale_down"))
SRR1958165 SRR1958166 SRR1958167 SRR1958168 SRR1958169 SRR1958170
ENSG00000227232 7.978207 7.938153 8.006709 8.099521 8.143688 8.061025
ENSG00000238009 7.939243 7.845119 7.845119 7.845119 7.845119 7.941747
ENSG00000233750 7.978207 7.845119 7.977079 8.011787 7.939686 7.941747
ENSG00000268903 8.406419 8.450830 8.353740 8.394871 8.351862 8.429002
ENSG00000269981 8.429036 8.335134 8.416776 8.532870 8.460670 8.264907
dds <- estimateSizeFactors(dds)
kable(t(sizeFactors(dds)), booktabs = TRUE) %>%
  kable_styling(latex_options = c("centered","striped","HOLD_position","scale_down"))
SRR1958165 SRR1958166 SRR1958167 SRR1958168 SRR1958169 SRR1958170
1.021349 1.045443 1.038893 0.9764818 1.011798 0.9690877

Started log on scaled data

Usings the normalized=TRUE option in the counts() method of DESeq2, we adjust for different library sizes.

slog <- log2(counts(dds, normalized=TRUE)+1)
kable(head(slog, 5), booktabs = TRUE) %>%
  kable_styling(latex_options = c("centered","striped","HOLD_position","scale_down"))
SRR1958165 SRR1958166 SRR1958167 SRR1958168 SRR1958169 SRR1958170
ENSG00000227232 1.5647169 0.968299 1.958913 3.030088 3.4440569 2.622811
ENSG00000238009 0.9848425 0.000000 0.000000 0.000000 0.0000000 1.022828
ENSG00000233750 1.5647169 0.000000 1.548499 2.025828 0.9915642 1.022828
ENSG00000268903 5.1798097 5.396807 4.900958 5.120800 4.8905366 5.292054
ENSG00000269981 5.2922220 4.796126 5.231794 5.761615 5.4428972 4.364997
par(mfrow = c(1, 3))  # 3 columns
plot(slog[,1],slog[,2])
plot(assay(rld)[,1],assay(rld)[,2])
plot(assay(vsd)[,1],assay(vsd)[,2])
df <- bind_rows(
  as_tibble(slog[,1:2]) %>%
         mutate(transformation = "log2(x + 1)"),
  as_tibble(assay(rld)[, 1:2]) %>% mutate(transformation = "rlog"),
  as_tibble(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst"))

colnames(df)[1:2] <- c("x", "y")  

ggplot(df, aes(x = x, y = y)) + geom_hex(bins = 80) +
  coord_fixed() + facet_grid( . ~ transformation)

4. Exploratory Data Analysis

Sample distance heatmap

mycols <- brewer.pal(length(unique(condition)), "YlGnBu")[1:length(unique(condition))]
mycols[1] <- "#D7191C"

# Sample distance heatmap
sampleDists <- as.matrix(dist(t(assay(rld))))

heatmap.2(as.matrix(sampleDists), key=F, trace="none",
          col=colorpanel(100, "black", "white"),
          ColSideColors=mycols[condition], RowSideColors=mycols[condition],
          margin=c(10, 10), main="Sample Distance Matrix")

Principal Component Analysis (PCA)

pcaData <- plotPCA(rld, intgroup = c("condition"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=condition)) +
  geom_point(size=3) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) +
  coord_fixed() +
  geom_text_repel(aes(PC1, PC2, label = rownames(pcaData)), pcaData) +
  scale_color_manual(name="Condition", values = mycols) +
  theme(aspect.ratio=1)

K-means clustering of genes

k <- 5 # Number of clusters
n <- 100 # number of top genes by standard deviation
x <- assay(rld)
if(n>dim(x)[1]) n = dim(x)[1] # max as data
x <- x[order(apply(x,1,sd),decreasing=TRUE),]  # sort genes by standard deviation
x <- x[1:n,]   # only keep the n genes

x <- 100* x[1:n,] / apply(x[1:n,],1,function(y) sum(abs(y))) # L1 norm

set.seed(2)
#k = input$k  # number of clusters

cl <- kmeans(x,k,iter.max = 50)

hc <- hclust(dist(cl$centers-apply(cl$centers,1,mean) )  )
tem <- match(cl$cluster,hc$order) #  new order
x <- x[order(tem),]
bar <- sort(tem)

# heatmap with color bar define gene groups
myheatmap2 <- function (x,bar,n=-1) {
    # number of genes to show
  ngenes = as.character(table(bar))
    if(length(bar) >n && n != -1) {ix = sort( sample(1:length(bar),n) ); bar = bar[ix]; x = x[ix,]}

    # this will cutoff very large values, which could skew the color
  gene_names <- rownames(x)
    x <- as.matrix(x)-apply(x,1,mean)

    cutoff <- median(unlist(x)) + 3*sd (unlist(x))
    x[x>cutoff] <- cutoff
    cutoff <- median(unlist(x)) - 3*sd (unlist(x))
    x[x< cutoff] <- cutoff
    #colnames(x)= detectGroups(colnames(x))
  #row.names(x) <- gene_names
    heatmap.2(x,  Rowv =F,Colv=F, dendrogram ="none",
              col=colorpanel(75, "black", "white"),
              #col=greenred(75),
              labRow = gene_names,
              cexRow=0.6,
              density.info="none",
              trace="none", scale="none", keysize=.3,
              key=F,RowSideColors = mycolors[bar],
              margins = c(8, 24), srtCol=45
    )
    legend.text = paste("Cluster ", toupper(letters)[unique(bar)], " (N=", ngenes,")", sep="")
    par(lend = 1)           # square line ends for the color legend
    legend("topright",      # location of the legend on the heatmap plot
        legend = legend.text, # category labels
        col = mycolors,  # color key
        lty= 1,             # line style
        lwd = 10            # line width
    )
}

mycolors <- brewer.pal(k,"YlGnBu")
mycolors[1] <- "#D7191C"
myheatmap2(x-apply(x,1,mean), bar,1000)



 

Developed by Roberto Villegas-Diaz

Villegas.Roberto@hotmail.com